In [117]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from __future__ import print_function
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
%matplotlib inline
%config InlineBackend.figure_format = 'png'
# Show up to 50 columns when a DataFrame is displayed.
# The option is addressed by its full dotted name: a bare "max_columns" is
# treated as a pattern over all registered option names and can raise
# OptionError when more than one option matches it.
pd.set_option("display.max_columns", 50)
In [2]:
In [3]:
%%time
df = pd.read_csv("../data/train_2013.csv", index_col=0)
In [4]:
# Convert the timestamp column to datetimes; unparseable values become NaT.
df["date_time"] = df["date_time"].pipe(pd.to_datetime, errors="coerce")
In [6]:
# %%time
# skip_col = ["date_time","orig_destination_distance"]
# for col in df_1.columns:
# if col == skip_col:
# pass
# print(col, np.unique(df_1[col].astype(str)))
In [35]:
# check-in / check-out / distance columns => contain NaN values
In [9]:
%%time
df = df.reset_index(drop=True)
# 10000명의 데이터만 사용
df = df.ix[:9999]
In [11]:
# Persist the truncated frame (index included) to the working directory.
df.to_csv(path_or_buf="train_2013_10000.csv")
In [36]:
# Display the column labels of df.
df.columns
Out[36]:
In [37]:
# Reorder the columns so that the last six come first.
cols = df.columns.tolist()
cols = cols[-6:] + cols[:-6]
df = df[cols]
In [46]:
# Keep the first column in place and move the columns at positions 1-5 to the end.
cols = df.columns.tolist()
cols = cols[:1] + cols[6:] + cols[1:6]
df = df[cols]
In [48]:
# Think about which features to remove
In [53]:
# Preview the first rows of df.
df.head()
Out[53]:
In [49]:
# Display the column labels of df in their current order.
df.columns
Out[49]:
In [68]:
# Column names removed from df by the drop call that follows.
delete_list = [
    "user_location_city",
    "user_location_region",
    "is_mobile",
    "is_package",
    "hotel_country",
    "hotel_market",
]
In [70]:
# Remove every column named in delete_list.
df = df.drop(labels=delete_list, axis=1)
In [75]:
# Print the remaining column labels together with how many there are.
print(df.columns, df.shape[1])
In [84]:
# Remove three further columns from df.
df = df.drop(
    labels=[
        "posa_continent",
        "orig_destination_distance",
        "srch_destination_type_id",
    ],
    axis=1,
)
In [83]:
# Print the frequency table of every column except the timestamp column.
for col in df.columns:
    if col != "date_time":
        print(df[col].value_counts())
# df["posa_continent"].value_counts()
In [88]:
# Drop those columns and try running the model without any special feature engineering
In [100]:
# Parse the check-in and check-out columns as datetimes; unparseable values become NaT.
for date_col in ("srch_ci", "srch_co"):
    df[date_col] = pd.to_datetime(df[date_col], errors="coerce")
In [118]:
# One shared LabelEncoder instance. Each later fit_transform call refits it, so
# after the encoding cells it only holds the classes of the column fitted last.
le = preprocessing.LabelEncoder()
In [121]:
# Replace each search-date column with the encoder's integer labels,
# refitting the shared encoder for every column in turn.
for date_col in ("srch_ci", "srch_co"):
    df[date_col] = le.fit_transform(df[date_col])
In [134]:
# Reduce the timestamp to its calendar date, then label-encode the dates.
df["date_time"] = le.fit_transform(df["date_time"].dt.date)
In [135]:
# Split by column position: every column after the first is a feature, and the
# first column alone is the target (kept as a one-column DataFrame).
# `.iloc` replaces `.ix`, which was deprecated in pandas 0.20 and removed in
# 1.0; both slices here are purely positional.
trn_x = df.iloc[:, 1:]
trn_y = df.iloc[:, :1]
In [136]:
# Shallow random forest using all CPU cores, with a fixed seed.
model = RandomForestClassifier(
    n_jobs=-1,
    max_depth=3,
    random_state=402,
)
In [137]:
# Fit the forest. trn_y is a one-column frame, so flatten it to a 1-D array:
# passing the column vector directly makes scikit-learn emit a
# DataConversionWarning before it flattens the target itself.
model.fit(trn_x, trn_y.values.ravel())
Out[137]:
In [153]:
# Importance of each feature as reported by the fitted forest, plus the
# standard deviation of that importance across the individual trees.
importances = model.feature_importances_
per_tree_importances = np.array([est.feature_importances_ for est in model.estimators_])
std = per_tree_importances.std(axis=0)
# Feature positions ordered from most to least important.
indices = importances.argsort()[::-1]

n_features = trn_x.shape[1]
print("Feature ranking:")
for position in range(n_features):
    feature_idx = indices[position]
    print("%d. feature %d %s (%f)" % (position + 1, feature_idx, trn_x.columns[feature_idx], importances[feature_idx]))

# Bar chart of the sorted importances; error bars show the per-tree spread and
# the x tick labels are the original column positions.
bar_positions = range(n_features)
plt.title("Feature importances")
plt.bar(bar_positions, importances[indices], color="r", yerr=std[indices], align="center")
plt.xticks(bar_positions, indices)
plt.xlim([-1, n_features])
plt.show()
In [ ]:
# 1. feature 11 cnt (0.533188)
# 2. feature 6 srch_co (0.102094)
# 3. feature 7 srch_adults_cnt (0.091235)
# 4. feature 12 hotel_continent (0.050153)
# 5. feature 3 user_id (0.042140)
# 6. feature 4 channel (0.039411)
# 7. feature 8 srch_children_cnt (0.036964)
# 8. feature 10 srch_destination_id (0.032244)
# 9. feature 5 srch_ci (0.030716)
# 10. feature 9 srch_rm_cnt (0.021721)
# 11. feature 1 site_name (0.009355)
# 12. feature 13 hotel_cluster (0.007323)
# 13. feature 0 date_time (0.003447)
# 14. feature 2 user_location_country (0.000008) => remove
In [154]:
# Load the sample submission file.
sub_ex = pd.read_csv(filepath_or_buffer="../sample_submission.csv")
In [164]:
# Preview the first rows of the sample submission.
sub_ex.head()
Out[164]:
In [158]:
# Preview the first rows of the feature matrix.
trn_x.head()
Out[158]:
In [ ]:
# I had assumed is_booking was the target y, but on second thought hotel_cluster is what matters
In [163]:
# Second split: the last column of df is the target and every other column is
# a feature, so the first column (the previous target) is now an input.
# `.iloc` replaces `.ix`, which was deprecated in pandas 0.20 and removed in
# 1.0; both slices here are purely positional.
trn_x1 = df.iloc[:, :-1]
trn_y1 = df.iloc[:, -1:]
# Refit the existing forest on the new split. The one-column target frame is
# flattened to 1-D so scikit-learn does not emit its column-vector
# DataConversionWarning.
model.fit(trn_x1, trn_y1.values.ravel())
# Importance of each feature and its standard deviation across the trees.
importances = model.feature_importances_
std = np.std([tree.feature_importances_ for tree in model.estimators_], axis=0)
# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]
print("Feature ranking:")
for f in range(trn_x1.shape[1]):
    print("%d. feature %d %s (%f)" % (f + 1, indices[f], trn_x1.columns[indices[f]], importances[indices[f]]))
# Bar chart of the sorted importances; error bars show the per-tree spread and
# the x tick labels are the original column positions.
plt.title("Feature importances")
plt.bar(range(trn_x1.shape[1]), importances[indices], color="r", yerr=std[indices], align="center")
plt.xticks(range(trn_x1.shape[1]), indices)
plt.xlim([-1, trn_x1.shape[1]])
plt.show()
# Next: take the top 10,000 rows, sample from them, and see whether the same
# ranking comes out again; if it changes, check whether the data is unstable.
# Also try sampling the features.
In [ ]:
# Feature ranking:
# 1. feature 13 hotel_continent (0.364650)
# 2. feature 11 srch_destination_id (0.172174)
# 3. feature 2 site_name (0.159535)
# 4. feature 3 user_location_country (0.102945)
# 5. feature 4 user_id (0.077250)
# 6. feature 7 srch_co (0.027816)
# 7. feature 6 srch_ci (0.019617)
# 8. feature 10 srch_rm_cnt (0.017608)
# 9. feature 9 srch_children_cnt (0.017537)
# 10. feature 1 date_time (0.014128)
# 11. feature 8 srch_adults_cnt (0.011914)
# 12. feature 5 channel (0.010902)
# 13. feature 12 cnt (0.003925)
# 14. feature 0 is_booking (0.000000)
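# Idea: a stay-length variable (srch_co - srch_ci)
# Idea: something about the users who booked (is_booking) -- note left unfinished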
In [179]:
# Preview the first rows of df after the encoding steps.
df.head()
Out[179]: